<?php
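/*
 * PHP word segmenter based on a Unicode-encoded dictionary
 *  1. Targets PHP 5; requires the iconv extension
 *  2. Segmentation uses the RMM (reverse maximum matching) algorithm; the dictionary has to be
 *     specially compiled, and this class provides MakeDict() for that purpose
 *  3. Basic workflow: SetSource -> StartAnalysis -> Get***Result
 *  4. The main dictionary is stored in a special encoded format, so it does not have to be
 *     loaded into memory
 *
 * Copyright IT柏拉图  QQ: 2500875 Email: 2500875#qq.com
 *
 * @version 2.0
 *
 */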

// Constant definitions.
// UCS2 : iconv name of the internal working encoding (big-endian UCS-2).
// _SP_ : the two bytes 0xFF 0xFE, used as the separator when add-on dictionary
//        lines are joined and split again in UCS-2 form.
define('UCS2', 'ucs-2be');
define('_SP_', "\xFF\xFE");
class PhpAnalysis
{
	
	// Hash option: modulus used by _get_index() and number of 8-byte slots in the
	// dictionary index table (original note: "Default: 0xFFFF").
	public $mask_value = 0x000F;
	
	// Input and output character sets (only utf-8, gbk/gb2312/gb18030 and big5 are supported).
	public $sourceCharSet = 'utf-8';
	public $targetCharSet = 'utf-8';
	
	// Kind of result to produce: 1 = everything, 2 = dictionary words, single CJK characters
	// and English, 3 = dictionary words and English.
	// NOTE(review): the getters in this class only special-case the value 2.
	public $resultType = 1;
	
	// Sentences shorter than this many bytes are not split; notSplitLen = n (characters) * 2 + 1.
	public $notSplitLen = 5;
	
	// Convert English words to lower case.
	public $toLower = false;
	
	// Use maximum-split mode to disambiguate two-word combinations.
	public $differMax = false;
	
	// Try to merge adjacent single characters into new words.
	public $unitWord = true;
	
	// Load the dictionaries as soon as an instance is constructed.
	public static $loadInit = false;
	
	// Use "popular word first" mode for disambiguation.
	public $differFreq = false;
	
	// Source string after conversion to UCS-2.
	private $sourceString = '';
	
	// Add-on dictionary, grouped by a one-letter section key.
	public $addonDic = array();
	public $addonDicFile = 'dict/words_addons.dic';
	
	// Main dictionary.
	public $dicStr = '';
	public $mainDic = array();
	public $mainDicHand = false;
	public $mainDicInfos = array();
	public $mainDicFile = 'dict/base_dic_full.dic';
	// Whether to load the whole dictionary up front (slower to load, faster to analyse)
	// or to read entries on demand (faster to load, slower to analyse).
	private $isLoadAll = false;
	
	// Maximum length, in bytes, of a main-dictionary word (characters = value / 2).
	private $dicWordMax = 14;
	// Result of the coarse split (typically whole sentences / runs of one character class).
	private $simpleResult = array();
	// Final result; an array of array('w' => word, 't' => type) once analysis has run.
	private $finallyResult = array();
	// Reset by SetSource(); declared here so it is not created as a dynamic property.
	public $finallyIndex = array();
	
	// Whether the dictionaries have been loaded.
	public $isLoadDic = false;
	// New words recognised or merged by the analyser.
	public $newWords = array();
	public $foundWordStr = '';
	// Time spent loading the dictionaries, in seconds.
	public $loadTime = 0;
	
	/**
	 * Constructor.
	 *
	 * Makes both dictionary paths absolute (relative to this file's directory),
	 * hands the text and charsets to SetSource(), and loads the dictionaries
	 * immediately when the static $loadInit flag is set.
	 *
	 * @param string $source_charset charset of $source
	 * @param string $target_charset charset used for output
	 * @param bool   $load_all       stored in $isLoadAll
	 * @param string $source         text to analyse
	 *
	 * @return void
	 */
	public function __construct($source_charset = 'utf-8', $target_charset = 'utf-8', $load_all = true, $source = '')
	{
		$baseDir            = __DIR__ . '/';
		$this->isLoadAll    = $load_all;
		$this->mainDicFile  = $baseDir . $this->mainDicFile;
		$this->addonDicFile = $baseDir . $this->addonDicFile;
		$this->SetSource($source, $source_charset, $target_charset);
		if (self::$loadInit) {
			$this->LoadDict();
		}
	}
	
	/**
	 * Destructor: closes the main dictionary handle unless it is still false.
	 */
	public function __destruct()
	{
		if ($this->mainDicHand === false) {
			return;
		}
		@fclose($this->mainDicHand);
	}
	
	/**
	 * Hash a key to its bucket number in the dictionary index.
	 *
	 * The bytes of the key are folded in from last to first; the running value
	 * is kept within 31 bits and finally reduced modulo $mask_value.
	 *
	 * @param string $key
	 * @return int bucket number in the range 0 .. $mask_value - 1
	 */
	private function _get_index($key)
	{
		$hash = 0x238f13af;
		for ($pos = strlen($key) - 1; $pos >= 0; $pos--) {
			$hash += ($hash << 5);
			$hash  = ($hash ^ ord($key[$pos])) & 0x7fffffff;
		}
		return ($hash % $this->mask_value);
	}
	
	/**
	 * Look a word up in the main dictionary file.
	 *
	 * The file starts with an index of 8-byte slots (offset, length, count), one
	 * per hash bucket; the bucket itself is a serialized array keyed by word.
	 * Buckets that were read once are cached in $mainDicInfos.
	 *
	 * @param string $key  word in UCS-2BE
	 * @param string $type 'word' returns the entry of $key; any other value
	 *                     returns the whole bucket that contains $key
	 * @return array|false the entry (or bucket), or false when the word is not in
	 *                     the dictionary or the dictionary cannot be read
	 */
	public function GetWordInfos($key, $type = 'word')
	{
		if (!$this->mainDicHand) {
			$this->mainDicHand = fopen($this->mainDicFile, 'r');
			// Without a dictionary nothing can be a word; do not seek on a failed handle.
			if (!$this->mainDicHand) {
				return false;
			}
		}
		$keynum = $this->_get_index($key);
		if (isset($this->mainDicInfos[$keynum])) {
			$data = $this->mainDicInfos[$keynum];
		} else {
			fseek($this->mainDicHand, $keynum * 8, SEEK_SET);
			$dat = fread($this->mainDicHand, 8);
			// A truncated index cannot be unpacked.
			if ($dat === false || strlen($dat) < 8) {
				return false;
			}
			$arr = unpack('I1s/n1l/n1c', $dat);
			if ($arr['l'] == 0) {
				return false;
			}
			fseek($this->mainDicHand, $arr['s'], SEEK_SET);
			// NOTE: unserialize() is only acceptable here because the dictionary is a
			// trusted local file produced by MakeDict(); never point it at untrusted data.
			$data                        = @unserialize(fread($this->mainDicHand, $arr['l']));
			$this->mainDicInfos[$keynum] = $data;
		}
		if (!is_array($data) || !isset($data[$key])) {
			return false;
		}
		return ($type == 'word' ? $data[$key] : $data);
	}
	
	/**
	 * Set the text to analyse and the charsets to use.
	 *
	 * Both charset names are stored lower-cased and the previous results are
	 * cleared. The text is converted to UCS-2BE and stored; when the text is
	 * empty or the source charset is not supported the stored text is left
	 * untouched and false is returned.
	 *
	 * @param string $source
	 * @param string $source_charset utf-8, gb* or big5 (case-insensitive)
	 * @param string $target_charset
	 *
	 * @return bool true when the text was accepted and converted
	 */
	public function SetSource($source, $source_charset = 'utf-8', $target_charset = 'utf-8')
	{
		$this->sourceCharSet = strtolower($source_charset);
		$this->targetCharSet = strtolower($target_charset);
		$this->simpleResult  = array();
		$this->finallyResult = array();
		$this->finallyIndex  = array();
		if ($source == '') {
			return false;
		}
		// Match on the normalised name so that 'UTF-8', 'GBK', 'Big5' ... are accepted too.
		$charset = $this->sourceCharSet;
		if (preg_match("/^utf/", $charset)) {
			$this->sourceString = iconv('utf-8', UCS2, $source);
		} else if (preg_match("/^gb/", $charset)) {
			$this->sourceString = iconv('utf-8', UCS2, iconv('gb18030', 'utf-8', $source));
		} else if (preg_match("/^big/", $charset)) {
			$this->sourceString = iconv('utf-8', UCS2, iconv('big5', 'utf-8', $source));
		} else {
			return false;
		}
		return true;
	}
	
	/**
	 * Choose the result type (only affects the getters working on the final result).
	 *
	 * @param int $rstype 1 = everything, 2 = leave out symbols
	 *
	 * @return void
	 */
	public function SetResultType($rstype)
	{
		$this->resultType = $rstype;
	}
	
	/**
	 * Open the main dictionary and read the add-on dictionary into memory.
	 *
	 * The main dictionary is only opened; its entries are read on demand by
	 * GetWordInfos(). The add-on file consists of section lines ("x:", where x
	 * is a single byte) followed by lines of comma-separated UTF-8 words; each
	 * word is stored as $addonDic[section][word in UCS-2BE] = byte length.
	 *
	 * @param string $maindic optional path of another main dictionary; ignored
	 *                        when empty or when the file does not exist
	 *
	 * @return void
	 */
	public function LoadDict($maindic = '')
	{
		$startt = microtime(true);
		if ($maindic != '' && file_exists($maindic)) {
			$this->mainDicFile = $maindic;
		}
		
		// Do not leak a handle opened by an earlier call or by GetWordInfos().
		if (is_resource($this->mainDicHand)) {
			fclose($this->mainDicHand);
		}
		$this->mainDicHand = fopen($this->mainDicFile, 'r');
		
		// Add-on dictionary. The separator is loop-invariant, so convert it once.
		$spstr = iconv(UCS2, 'utf-8', _SP_);
		$lines = file($this->addonDicFile);
		if ($lines === false) {
			// file() has already raised a warning; carry on without add-on words.
			$lines = array();
		}
		$hw = '';
		foreach ($lines as $d) {
			$d = trim($d);
			if ($d == '') {
				continue;
			}
			if (substr($d, 1, 1) == ':') {
				// Section header: the first byte names the group.
				$hw = substr($d, 0, 1);
				continue;
			}
			$wall = iconv('utf-8', UCS2, join($spstr, explode(',', $d)));
			foreach (explode(_SP_, $wall) as $estr) {
				$this->addonDic[$hw][$estr] = strlen($estr);
			}
		}
		$this->loadTime  = microtime(true) - $startt;
		$this->isLoadDic = true;
	}
	
	/**
	 * Tell whether a word is present in the main dictionary.
	 *
	 * @param string $word word in UCS-2BE
	 * @return bool
	 */
	public function IsWord($word)
	{
		return $this->GetWordInfos($word) !== false;
	}
	
	/**
	 * Build the "/<second field><first field>" tag of a word from its dictionary entry.
	 *
	 * Words shorter than four bytes, and words without a dictionary entry, get "/s".
	 *
	 * @param string $word word in UCS-2BE
	 * @return string
	 */
	public function GetWordProperty($word)
	{
		if (strlen($word) >= 4) {
			$infos = $this->GetWordInfos($word);
			if (isset($infos[1])) {
				return '/' . $infos[1] . $infos[0];
			}
		}
		return '/s';
	}
	
	/**
	 * Register the attributes of a word (normally a newly recognised one).
	 *
	 * Words shorter than four bytes are ignored. The first registration stores
	 * $infos and starts the counter in $newWords at 1; later registrations
	 * increment both that counter and the stored 'c' value.
	 *
	 * @param string $word  word in UCS-2BE
	 * @param array  $infos array('c' => frequency, 'm' => part of speech)
	 * @return void
	 */
	public function SetWordInfos($word, $infos)
	{
		if (strlen($word) < 4) {
			return;
		}
		if (!isset($this->mainDicInfos[$word])) {
			$this->newWords[$word]     = 1;
			$this->mainDicInfos[$word] = $infos;
			return;
		}
		$this->newWords[$word]++;
		$this->mainDicInfos[$word]['c']++;
	}
	
	/**
	 * Run the analysis on the text previously passed to SetSource().
	 *
	 * The UCS-2 source is first split coarsely into runs of one character class
	 * (stored in $simpleResult with a type code):
	 *   1 CJK text, 2 ANSI word (letters, digits, '.', '@', '#', '%', '+', '-'),
	 *   3 ANSI symbol, 4 pure number, 5 non-ANSI symbol or unsupported character,
	 *   13 book title found between 《 and 》, 21 copy of a book title that is
	 *   segmented further in maximum-split mode.
	 * Every run except pure numbers is handed to _deep_analysis(); finally the
	 * two result sets are merged by _sort_finally_result().
	 *
	 * @param bool $optimize whether the fine-grained result is post-processed
	 *                       (passed through to _deep_analysis())
	 * @return void
	 */
	public function StartAnalysis($optimize = true)
	{
		if (!$this->isLoadDic) {
			$this->LoadDict();
		}
		$this->simpleResult = $this->finallyResult = array();
		// Trailing space: being a symbol, it forces the last pending run to be flushed.
		$this->sourceString .= chr(0) . chr(32);
		$slen = strlen($this->sourceString);
		// Full-width forms U+FF00..U+FF5E are treated as their ASCII counterparts 0x20..0x7E.
		$sbcArr = array();
		for ($i = 0xFF00; $i < 0xFF5F; $i++) {
			$sbcArr[$i] = $i - 0xFEE0;
		}
		// Coarse split.
		$onstr         = '';  // run collected so far
		$lastc         = 1;   // type of the previous character, see the list above
		$s             = 0;   // next free index in $simpleResult
		$ansiWordMatch = "[0-9a-z@#%\+\.-]";
		for ($i = 0; $i < $slen; $i++) {
			$c  = $this->sourceString[$i] . $this->sourceString[++$i];
			$cn = hexdec(bin2hex($c));
			$cn = isset($sbcArr[$cn]) ? $sbcArr[$cn] : $cn;
			// ANSI character
			if ($cn < 0x80) {
				if (preg_match('/' . $ansiWordMatch . '/i', chr($cn))) {
					if ($lastc != 2 && $onstr != '') {
						$this->_flush_simple_segment($onstr, $lastc, $s, $optimize);
						$onstr = '';
					}
					$lastc = 2;
					$onstr .= chr(0) . chr($cn);
				} else {
					if ($onstr != '') {
						$this->_flush_simple_segment($onstr, $lastc, $s, $optimize);
					}
					$onstr = '';
					$lastc = 3;
					// Control characters are dropped.
					if ($cn < 31) {
						continue;
					}
					$this->simpleResult[$s]['w'] = chr(0) . chr($cn);
					$this->simpleResult[$s]['t'] = 3;
					$s++;
				}
				continue;
			}
			// Regular CJK text
			if (($cn > 0x3FFF && $cn < 0x9FA6) || ($cn > 0xF8FF && $cn < 0xFA2D) || ($cn > 0xABFF && $cn < 0xD7A4) || ($cn > 0x3040 && $cn < 0x312B)) {
				if ($lastc != 1 && $onstr != '') {
					$this->_flush_simple_segment($onstr, $lastc, $s, $optimize);
					$onstr = '';
				}
				$lastc = 1;
				$onstr .= $c;
				continue;
			}
			// Two-byte symbol
			if ($onstr != '') {
				$this->_flush_simple_segment($onstr, $lastc, $s, $optimize);
			}
			
			// Book title: 《 ... 》
			if ($cn == 0x300A) {
				$tmpw = '';
				$n    = 1;
				$isok = false;
				$ew   = chr(0x30) . chr(0x0B);
				while (true) {
					// Stop at the end of the text; reading past it yields empty strings,
					// so $tmpw would never reach the length limit and the loop would not end.
					if ($i + $n + 1 >= $slen) {
						break;
					}
					$w = $this->sourceString[$i + $n] . $this->sourceString[$i + $n + 1];
					if ($w == $ew) {
						$this->simpleResult[$s]['w'] = $c;
						$this->simpleResult[$s]['t'] = 5;
						$s++;
						
						$this->simpleResult[$s]['w'] = $tmpw;
						// Report a title only the first time it is seen. The check has to
						// come before the word is marked as known.
						if (!isset($this->newWords[$tmpw])) {
							if ($tmpw != '') {
								$this->foundWordStr .= $this->_out_string_encoding($tmpw) . '/nb, ';
								$this->SetWordInfos($tmpw, array(
									'c' => 1,
									'm' => 'nb'
								));
							}
							// SetWordInfos() ignores words shorter than four bytes.
							$this->newWords[$tmpw] = 1;
						}
						$this->simpleResult[$s]['t'] = 13;
						
						$s++;
						
						// Maximum-split mode: segment the title as well.
						if ($this->differMax) {
							$this->simpleResult[$s]['w'] = $tmpw;
							$this->simpleResult[$s]['t'] = 21;
							$this->_deep_analysis($tmpw, $lastc, $s, $optimize);
							$s++;
						}
						
						$this->simpleResult[$s]['w'] = $ew;
						$this->simpleResult[$s]['t'] = 5;
						$s++;
						
						$i     = $i + $n + 1;
						$isok  = true;
						$onstr = '';
						$lastc = 5;
						break;
					}
					$n = $n + 2;
					$tmpw .= $w;
					// Give up on titles longer than 60 bytes.
					if (strlen($tmpw) > 60) {
						break;
					}
				}
				if (!$isok) {
					// No closing mark: 《 is just a symbol.
					$this->simpleResult[$s]['w'] = $c;
					$this->simpleResult[$s]['t'] = 5;
					$s++;
					$onstr = '';
					$lastc = 5;
				}
				continue;
			}
			
			$onstr = '';
			$lastc = 5;
			// The ideographic space is dropped.
			if ($cn == 0x3000) {
				continue;
			}
			$this->simpleResult[$s]['w'] = $c;
			$this->simpleResult[$s]['t'] = 5;
			$s++;
		}
		
		// Merge coarse and fine results.
		$this->_sort_finally_result();
	}
	
	/**
	 * Store one finished run of the coarse split and analyse it further.
	 *
	 * A run of type 2 that contains none of a-z @ # % + is re-typed as 4 (pure
	 * number); such runs are stored but not analysed.
	 *
	 * @param string $segment  the run, in UCS-2BE
	 * @param int    $type     type of the run; updated when the run is re-typed
	 * @param int    $pos      index to store the run at; incremented afterwards
	 * @param bool   $optimize passed through to _deep_analysis()
	 * @return void
	 */
	private function _flush_simple_segment($segment, &$type, &$pos, $optimize)
	{
		$this->simpleResult[$pos]['w'] = $segment;
		if ($type == 2 && !preg_match('/[a-z@#%\+]/i', iconv(UCS2, 'utf-8', $segment))) {
			$type = 4;
		}
		$this->simpleResult[$pos]['t'] = $type;
		if ($type != 4) {
			$this->_deep_analysis($segment, $type, $pos, $optimize);
		}
		$pos++;
	}
	
	/**
	 * Fine-grained analysis of one run of the coarse split.
	 *
	 * Runs that are not CJK text are kept as a single token (lower-cased when
	 * $toLower is on). CJK runs of at least $notSplitLen bytes, or of five bytes
	 * and more, go to _deep_analysis_cn(). Shorter CJK runs are either appended
	 * to a preceding number when they start with a unit word (add-on section
	 * 'u'), or kept as they are.
	 *
	 * @param string $str      the run in UCS-2BE (may be shortened when a unit is merged)
	 * @param int    $ctype    type of the run; 1 means CJK text
	 * @param int    $spos     index of the run in $simpleResult
	 * @param bool   $optimize passed through to _deep_analysis_cn()
	 * @return void
	 */
	private function _deep_analysis(&$str, $ctype, $spos, $optimize = true)
	{
		// Not CJK: one token.
		if ($ctype != 1) {
			$this->finallyResult[$spos][] = $this->toLower ? strtolower($str) : $str;
			return;
		}
		
		$slen = strlen($str);
		if ($slen >= $this->notSplitLen || $slen >= 5) {
			$this->_deep_analysis_cn($str, $ctype, $spos, $slen, $optimize);
			return;
		}
		
		// Short CJK run: is it a unit following a number?
		$prevType = ($spos > 0) ? $this->simpleResult[$spos - 1]['t'] : 0;
		$isUnit   = isset($this->addonDic['u'][$str]) || isset($this->addonDic['u'][substr($str, 0, 2)]);
		if ($prevType != 4 || !$isUnit) {
			$this->finallyResult[$spos][] = $str;
			return;
		}
		
		// Only the first character is the unit; the second one is split off when
		// it is listed in add-on section 's'.
		$rest = '';
		if (!isset($this->addonDic['u'][$str]) && isset($this->addonDic['s'][substr($str, 2, 2)])) {
			$rest = substr($str, 2, 2);
			$str  = substr($str, 0, 2);
		}
		$merged                             = $this->simpleResult[$spos - 1]['w'] . $str;
		$this->simpleResult[$spos - 1]['w'] = $merged;
		$this->simpleResult[$spos - 1]['t'] = 4;
		if (!isset($this->newWords[$merged])) {
			$this->foundWordStr .= $this->_out_string_encoding($merged) . '/mu, ';
			$this->SetWordInfos($merged, array(
				'c' => 1,
				'm' => 'mu'
			));
		}
		// The run itself has been absorbed by its predecessor.
		$this->simpleResult[$spos]['w'] = '';
		if ($rest != '') {
			$this->finallyResult[$spos - 1][] = $merged;
			$this->finallyResult[$spos - 1][] = $rest;
		}
	}
	
	/**
	 * Fine-grained segmentation of a CJK run by reverse maximum matching.
	 *
	 * The run is scanned from its end towards its start. At each position the
	 * longest candidate ($dicWordMax bytes, shrinking by one character at a time)
	 * that IsWord() accepts is taken; when nothing matches, the single character
	 * is taken. The collected words are reversed into reading order, stored in
	 * $finallyResult[$spos] and, when $optimize is on, passed to _optimize_result().
	 *
	 * @param string $str      the run in UCS-2BE
	 * @param int    $lastec   type of the run (not used in this method)
	 * @param int    $spos     index of the run in $simpleResult
	 * @param int    $slen     length of $str in bytes
	 * @param bool   $optimize whether to post-process the result
	 * @return void
	 */
	private function _deep_analysis_cn(&$str, $lastec, $spos, $slen, $optimize = true)
	{
		// U+201C, the left double quotation mark.
		$quote1 = chr(0x20) . chr(0x1C);
		$tmparr = array();
		$hasw   = 0;
		// A run of fewer than 11 bytes that directly follows an opening quotation
		// mark is treated as one word.
		if ($spos > 0 && $slen < 11 && $this->simpleResult[$spos - 1]['w'] == $quote1) {
			$tmparr[] = $str;
			if (!isset($this->newWords[$str])) {
				$this->foundWordStr .= $this->_out_string_encoding($str) . '/nq, ';
				$this->SetWordInfos($str, array(
					'c' => 1,
					'm' => 'nq'
				));
			}
			// Outside maximum-split mode the quoted word is the whole result.
			if (!$this->differMax) {
				$this->finallyResult[$spos][] = $str;
				return;
			}
		}
		// Segment; $i is the index of the last byte of the current character.
		for ($i = $slen - 1; $i > 0; $i -= 2) {
			// The single character at the current position.
			$nc = $str[$i - 1] . $str[$i];
			// Only the leading character of the run is left.
			if ($i <= 2) {
				$tmparr[] = $nc;
				$i        = 0;
				break;
			}
			$isok = false;
			// From here on $i is the exclusive end offset of the candidate words.
			$i    = $i + 1;
			for ($k = $this->dicWordMax; $k > 1; $k = $k - 2) {
				// Candidate would start before the beginning of the run.
				if ($i < $k)
					continue;
				$w = substr($str, $i - $k, $k);
				// Down to a single character: restore $i and fall back to $nc below.
				if (strlen($w) <= 2) {
					$i = $i - 1;
					break;
				}
				if ($this->IsWord($w)) {
					$tmparr[] = $w;
					// Together with the loop's "$i -= 2" this lands on the last byte of
					// the character in front of the matched word.
					$i        = $i - $k + 1;
					$isok     = true;
					break;
				}
			}
			// No dictionary word ends here.
			if (!$isok)
				$tmparr[] = $nc;
		}
		$wcount = count($tmparr);
		if ($wcount == 0)
			return;
		// Words were collected back to front.
		$this->finallyResult[$spos] = array_reverse($tmparr);
		// Post-processing (disambiguation, new words, numerals, personal names ...).
		if ($optimize) {
			$this->_optimize_result($this->finallyResult[$spos], $spos);
		}
	}
	
	/**
	 * Post-process the words of one segmented run: merge numeral + unit, surname +
	 * given name, word + suffix and adjacent single characters into new words.
	 *
	 * Type codes of $simpleResult used here: 1 CJK text, 2 ANSI word, 3 ANSI
	 * symbol, 4 pure number, 5 non-ANSI symbol or unsupported character.
	 * Add-on dictionary sections used here: 'c' and 'u' (numeral and unit),
	 * 'n' (leading words, usually surnames), 'a' (suffixes such as place-name
	 * endings), 's' and 't' (words that block a merge).
	 *
	 * @param array $smarr words of the run in UCS-2BE; replaced by the optimised list
	 * @param int   $spos  index of the run in $simpleResult
	 * @return void
	 */
	private function _optimize_result(&$smarr, $spos)
	{
		$newarr = array();
		$prePos = $spos - 1;
		$arlen  = count($smarr);
		$i      = $j = 0;
		// Numeral + unit across the run boundary: the previous run is a number (or a
		// numeral word) and this run starts with a unit.
		if ($prePos > -1 && !isset($this->finallyResult[$prePos])) {
			$lastw = $this->simpleResult[$prePos]['w'];
			$lastt = $this->simpleResult[$prePos]['t'];
			if (($lastt == 4 || isset($this->addonDic['c'][$lastw])) && isset($this->addonDic['u'][$smarr[0]])) {
				$this->simpleResult[$prePos]['w'] = $lastw . $smarr[0];
				$this->simpleResult[$prePos]['t'] = 4;
				if (!isset($this->newWords[$this->simpleResult[$prePos]['w']])) {
					$this->foundWordStr .= $this->_out_string_encoding($this->simpleResult[$prePos]['w']) . '/mu, ';
					$this->SetWordInfos($this->simpleResult[$prePos]['w'], array(
						'c' => 1,
						'm' => 'mu'
					));
				}
				$smarr[0] = '';
				$i++;
			}
		}
		for (; $i < $arlen; $i++) {
			
			// Last word: nothing to pair it with.
			if (!isset($smarr[$i + 1])) {
				$newarr[$j] = $smarr[$i];
				break;
			}
			$cw      = $smarr[$i];
			$nw      = $smarr[$i + 1];
			$ischeck = false;
			// Numeral + unit
			if (isset($this->addonDic['c'][$cw]) && isset($this->addonDic['u'][$nw])) {
				// Maximum-split mode keeps the parts, wrapped in parentheses.
				if ($this->differMax) {
					$newarr[$j] = chr(0) . chr(0x28);
					$j++;
					$newarr[$j] = $cw;
					$j++;
					$newarr[$j] = $nw;
					$j++;
					$newarr[$j] = chr(0) . chr(0x29);
					$j++;
				}
				$newarr[$j] = $cw . $nw;
				if (!isset($this->newWords[$newarr[$j]])) {
					$this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/mu, ';
					$this->SetWordInfos($newarr[$j], array(
						'c' => 1,
						'm' => 'mu'
					));
				}
				$j++;
				$i++;
				$ischeck = true;
			}
			// Leading word (usually a surname)
			else if (isset($this->addonDic['n'][$smarr[$i]])) {
				$is_rs = false;
				// A two-character word tagged 'r' or 'c', or a very frequent one, is not
				// taken as a given name.
				if (strlen($nw) == 4) {
					$pf = $this->_word_pos_and_freq($nw);
					if ($pf !== false && ($pf[0] == 'r' || $pf[0] == 'c' || $pf[1] > 500)) {
						$is_rs = true;
					}
				}
				if (!isset($this->addonDic['s'][$nw]) && strlen($nw) < 5 && !$is_rs) {
					$newarr[$j] = $cw . $nw;
					// Try a third, single-character word.
					if (strlen($nw) == 2 && isset($smarr[$i + 2]) && strlen($smarr[$i + 2]) == 2 && !isset($this->addonDic['s'][$smarr[$i + 2]])) {
						$newarr[$j] .= $smarr[$i + 2];
						$i++;
					}
					if (!isset($this->newWords[$newarr[$j]])) {
						$this->SetWordInfos($newarr[$j], array(
							'c' => 1,
							'm' => 'nr'
						));
						$this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/nr, ';
					}
					// To guard against mistakes the parts are kept as well, in parentheses.
					if (strlen($nw) == 4) {
						$j++;
						$newarr[$j] = chr(0) . chr(0x28);
						$j++;
						$newarr[$j] = $cw;
						$j++;
						$newarr[$j] = $nw;
						$j++;
						$newarr[$j] = chr(0) . chr(0x29);
					}
					
					$j++;
					$i++;
					$ischeck = true;
				}
			}
			// Suffix word (place names and the like)
			else if (isset($this->addonDic['a'][$nw])) {
				$is_rs = false;
				// A word tagged 'a', 'r' or 'c', or a very frequent one, does not take a suffix.
				if (strlen($cw) > 2) {
					$pf = $this->_word_pos_and_freq($cw);
					if ($pf !== false && ($pf[0] == 'a' || $pf[0] == 'r' || $pf[0] == 'c' || $pf[1] > 500)) {
						$is_rs = true;
					}
				}
				if (!isset($this->addonDic['s'][$cw]) && !$is_rs) {
					$newarr[$j] = $cw . $nw;
					if (!isset($this->newWords[$newarr[$j]])) {
						$this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/na, ';
						$this->SetWordInfos($newarr[$j], array(
							'c' => 1,
							'm' => 'na'
						));
					}
					$i++;
					$j++;
					$ischeck = true;
				}
			}
			// New-word detection: merge two adjacent single characters.
			else if ($this->unitWord) {
				if (strlen($cw) == 2 && strlen($nw) == 2 && !isset($this->addonDic['s'][$cw]) && !isset($this->addonDic['t'][$cw]) && !isset($this->addonDic['a'][$cw]) && !isset($this->addonDic['s'][$nw]) && !isset($this->addonDic['c'][$nw])) {
					$newarr[$j] = $cw . $nw;
					// Try a third, single-character word that is a suffix or a unit.
					if (isset($smarr[$i + 2]) && strlen($smarr[$i + 2]) == 2 && (isset($this->addonDic['a'][$smarr[$i + 2]]) || isset($this->addonDic['u'][$smarr[$i + 2]]))) {
						$newarr[$j] .= $smarr[$i + 2];
						$i++;
					}
					if (!isset($this->newWords[$newarr[$j]])) {
						$this->foundWordStr .= $this->_out_string_encoding($newarr[$j]) . '/ms, ';
						$this->SetWordInfos($newarr[$j], array(
							'c' => 1,
							'm' => 'ms'
						));
					}
					$i++;
					$j++;
					$ischeck = true;
				}
			}
			
			// No rule applied.
			if (!$ischeck) {
				$newarr[$j] = $cw;
				// Two-word disambiguation in maximum-split mode: look for a dictionary
				// word made of $cw plus the leading characters of $nw.
				if ($this->differMax && !isset($this->addonDic['s'][$cw]) && strlen($cw) < 5 && strlen($nw) < 7) {
					$slen = strlen($nw);
					for ($y = 2; $y <= $slen - 2; $y = $y + 2) {
						$nhead = substr($nw, $y - 2, 2);
						$nfont = $cw . substr($nw, 0, $y - 2);
						if ($this->IsWord($nfont . $nhead)) {
							if (strlen($cw) > 2)
								$j++;
							$newarr[$j] = $nfont . $nhead;
						}
					}
				}
				$j++;
			}
			
		} //end for
		$smarr = $newarr;
	}
	
	/**
	 * Read part of speech and frequency of a dictionary word.
	 *
	 * Entries written by MakeDict() are array(0 => frequency, 1 => part of
	 * speech); entries keyed 'c' / 'm' are accepted as well.
	 *
	 * @param string $word word in UCS-2BE
	 * @return array|false array(part of speech, frequency), or false when the word
	 *                     has no entry or the entry carries no part of speech
	 */
	private function _word_pos_and_freq($word)
	{
		$infos = $this->GetWordInfos($word);
		if (!is_array($infos)) {
			return false;
		}
		if (isset($infos['m'])) {
			return array($infos['m'], isset($infos['c']) ? $infos['c'] : 0);
		}
		if (isset($infos[1])) {
			return array($infos[1], isset($infos[0]) ? $infos[0] : 0);
		}
		return false;
	}
	
	/**
	 * Merge the coarse and the fine-grained result into the final word list.
	 *
	 * Walks $simpleResult in order and skips runs whose text is empty. A run that
	 * has fine-grained words contributes those (type 20, empty ones left out);
	 * any other run is copied with its own type, except runs of type 21.
	 * The list replaces $finallyResult.
	 *
	 * @return void
	 */
	private function _sort_finally_result()
	{
		$merged = array();
		foreach ($this->simpleResult as $pos => $run) {
			if (empty($run['w'])) {
				continue;
			}
			$hasWords = isset($this->finallyResult[$pos]) && count($this->finallyResult[$pos]) > 0;
			if ($hasWords) {
				foreach ($this->finallyResult[$pos] as $word) {
					if (empty($word)) {
						continue;
					}
					$merged[] = array('w' => $word, 't' => 20);
				}
				continue;
			}
			if ($run['t'] != 21) {
				$merged[] = array('w' => $run['w'], 't' => $run['t']);
			}
		}
		$this->finallyResult = $merged;
	}
	
	/**
	 * Convert a UCS-2BE string to the output charset.
	 *
	 * Code 1 of _source_result_charset() gives UTF-8, code 2 gives GB18030 and
	 * every other code gives Big5.
	 *
	 * @param string $str string in UCS-2BE
	 * @return string
	 */
	private function _out_string_encoding(&$str)
	{
		switch ($this->_source_result_charset()) {
			case 1:
				return iconv(UCS2, 'utf-8', $str);
			case 2:
				return iconv('utf-8', 'gb18030', iconv(UCS2, 'utf-8', $str));
			default:
				return iconv('utf-8', 'big5', iconv(UCS2, 'utf-8', $str));
		}
	}
	
	/**
	 * Return the final result as one string.
	 *
	 * Every word is preceded by $spword, so the string starts with the
	 * separator. With result type 2, words of type 3 and 5 (symbols) are left
	 * out; a word that is a single space is always left out.
	 *
	 * @param string $spword        separator put in front of every word
	 * @param bool   $word_meanings append the tag from GetWordProperty() to every word
	 * @return string
	 */
	public function GetFinallyResult($spword = ' ', $word_meanings = false)
	{
		$out = '';
		foreach ($this->finallyResult as $item) {
			if ($this->resultType == 2 && ($item['t'] == 3 || $item['t'] == 5)) {
				continue;
			}
			$tag  = $word_meanings ? $this->GetWordProperty($item['w']) : '';
			$text = $this->_out_string_encoding($item['w']);
			if ($text == ' ') {
				continue;
			}
			$out .= $spword . $text . $tag;
		}
		return $out;
	}
	
	/**
	 * Return the runs of the coarse split, without their type codes.
	 *
	 * Empty runs and runs that consist of a single space are left out.
	 *
	 * @return array list of strings in the output charset
	 */
	public function GetSimpleResult()
	{
		$texts = array();
		foreach ($this->simpleResult as $run) {
			if (empty($run['w'])) {
				continue;
			}
			$text = $this->_out_string_encoding($run['w']);
			if ($text == ' ') {
				continue;
			}
			$texts[] = $text;
		}
		return $texts;
	}
	
	/**
	 * Return the runs of the coarse split together with their type codes
	 * (1 CJK text, 2 ANSI word, 3 ANSI symbol, 4 number, 5 CJK punctuation or
	 * unrecognised character; full-width forms count as their ANSI counterparts).
	 *
	 * Runs that consist of a single space are left out; the original indexes are kept.
	 *
	 * @return array index => array('w' => text in the output charset, 't' => type)
	 */
	public function GetSimpleResultAll()
	{
		$runs = array();
		foreach ($this->simpleResult as $pos => $run) {
			$text = $this->_out_string_encoding($run['w']);
			if ($text == ' ') {
				continue;
			}
			$runs[$pos] = array('w' => $text, 't' => $run['t']);
		}
		return $runs;
	}
	
	/**
	 * Count how often every word occurs in the final result.
	 *
	 * With result type 2, words of type 3 and 5 (symbols) are not counted; a
	 * word that is a single space is never counted.
	 *
	 * @return array word in the output charset => count, highest count first
	 */
	public function GetFinallyIndex()
	{
		$counts = array();
		foreach ($this->finallyResult as $item) {
			if ($this->resultType == 2 && ($item['t'] == 3 || $item['t'] == 5)) {
				continue;
			}
			$text = $this->_out_string_encoding($item['w']);
			if ($text == ' ') {
				continue;
			}
			$counts[$text] = isset($counts[$text]) ? $counts[$text] + 1 : 1;
		}
		arsort($counts);
		return $counts;
	}
	
	/**
	 * Return the most frequent words of the final result, joined by ",".
	 *
	 * Words are taken in the order of GetFinallyIndex() (highest count first).
	 * Left out are: one-byte words, two-byte words containing anything but
	 * letters and digits, and words shorter than four bytes without a letter
	 * (single CJK characters, short numbers).
	 *
	 * @param int $num maximum number of keywords to return
	 * @return string
	 */
	public function GetFinallyKeywords($num = 10)
	{
		if ($num <= 0) {
			return '';
		}
		$picked = array();
		foreach ($this->GetFinallyIndex() as $k => $v) {
			// Purely numeric words come back from the array as integer keys.
			$k   = (string) $k;
			$len = strlen($k);
			// One-byte word
			if ($len == 1) {
				continue;
			}
			// Two-byte word that is not plain letters/digits
			if ($len == 2 && preg_match('/[^0-9a-zA-Z]/', $k)) {
				continue;
			}
			// Single CJK character or short number
			if ($len < 4 && !preg_match('/[a-zA-Z]/', $k)) {
				continue;
			}
			$picked[] = $k;
			// Stop as soon as $num keywords have been collected, not one later.
			if (count($picked) >= $num) {
				break;
			}
		}
		return implode(',', $picked);
	}
	
	/**
	 * Map the target charset name to a numeric code.
	 *
	 * @return int 1 when the name starts with "utf", 2 for "gb", 3 for "big", 4 otherwise
	 */
	private function _source_result_charset()
	{
		$codes = array(
			"/^utf/" => 1,
			"/^gb/"  => 2,
			"/^big/" => 3
		);
		foreach ($codes as $pattern => $code) {
			if (preg_match($pattern, $this->targetCharSet)) {
				return $code;
			}
		}
		return 4;
	}
	
	/**
	 * Compile a text dictionary into the binary format read by GetWordInfos().
	 *
	 * Source format: UTF-8 text, one "word,frequency,attribute" entry per line
	 * (see the sample dict/not-build/base_dic_full.txt); lines starting with "@",
	 * blank lines and lines with fewer than three fields are skipped.
	 * Target format: $mask_value index slots of 8 bytes (offset, length, entry
	 * count), followed by one serialized array per hash bucket.
	 * PHP needs enough memory to hold the whole dictionary.
	 *
	 * @param string $source_file path of the text dictionary
	 * @param string $target_file path of the compiled dictionary; defaults to $mainDicFile
	 * @return void
	 */
	public function MakeDict($source_file, $target_file = '')
	{
		$target_file = ($target_file == '' ? $this->mainDicFile : $target_file);
		$fp          = fopen($source_file, 'r');
		if ($fp === false) {
			return;
		}
		$allk = array();
		// Read whole lines: a fixed-size read would cut long entries in two.
		while (($line = fgets($fp)) !== false) {
			if (trim($line) == '' || $line[0] == '@') {
				continue;
			}
			$fields = explode(',', $line);
			if (count($fields) < 3) {
				continue;
			}
			list($w, $r, $a) = $fields;
			$a = trim($a);
			$w = iconv('utf-8', UCS2, $w);
			$k = $this->_get_index($w);
			$allk[$k][$w] = array(
				$r,
				$a
			);
		}
		fclose($fp);
		
		$fp = fopen($target_file, 'w');
		if ($fp === false) {
			return;
		}
		$heade_rarr = array();
		$alldat     = '';
		// Bucket data starts right after the index table.
		$start_pos  = $this->mask_value * 8;
		foreach ($allk as $k => $v) {
			$dat  = serialize($v);
			$dlen = strlen($dat);
			// The index stores the length in 16 bits; a longer bucket cannot be read back.
			if ($dlen > 0xFFFF) {
				trigger_error("MakeDict: bucket {$k} is {$dlen} bytes, more than the 65535 bytes the index can describe; use a larger mask_value", E_USER_WARNING);
			}
			$alldat .= $dat;
			
			$heade_rarr[$k] = array(
				$start_pos,
				$dlen,
				count($v)
			);
			
			$start_pos += $dlen;
		}
		unset($allk);
		for ($i = 0; $i < $this->mask_value; $i++) {
			$slot = isset($heade_rarr[$i]) ? $heade_rarr[$i] : array(
				0,
				0,
				0
			);
			fwrite($fp, pack("Inn", $slot[0], $slot[1], $slot[2]));
		}
		fwrite($fp, $alldat);
		fclose($fp);
	}
	
	/**
	 * Export the entries of the compiled main dictionary as text.
	 *
	 * Walks the $mask_value index slots of the dictionary and writes one
	 * "word,field0,field1" line (word in UTF-8) for every entry of every bucket.
	 *
	 * @param string $targetfile path of the text file to write
	 * @return bool true on success, false when a file cannot be opened
	 */
	public function ExportDict($targetfile)
	{
		if (!$this->mainDicHand) {
			$this->mainDicHand = fopen($this->mainDicFile, 'r');
		}
		if (!$this->mainDicHand) {
			return false;
		}
		$fp = fopen($targetfile, 'w');
		if ($fp === false) {
			return false;
		}
		// The index has exactly $mask_value slots (0 .. $mask_value - 1); the bytes
		// after it are bucket data, not another slot.
		for ($i = 0; $i < $this->mask_value; $i++) {
			fseek($this->mainDicHand, $i * 8, SEEK_SET);
			$dat = fread($this->mainDicHand, 8);
			// Truncated index
			if ($dat === false || strlen($dat) < 8) {
				continue;
			}
			$arr = unpack('I1s/n1l/n1c', $dat);
			if ($arr['l'] == 0) {
				continue;
			}
			fseek($this->mainDicHand, $arr['s'], SEEK_SET);
			// NOTE: unserialize() is only acceptable because the dictionary is a trusted
			// local file produced by MakeDict().
			$data = @unserialize(fread($this->mainDicHand, $arr['l']));
			if (!is_array($data)) {
				continue;
			}
			foreach ($data as $k => $v) {
				$w = iconv(UCS2, 'utf-8', $k);
				fwrite($fp, "{$w},{$v[0]},{$v[1]}\n");
			}
		}
		fclose($fp);
		return true;
	}
}